# --- SCRIPT: INDIA MODEL (v21 - 100% LOCAL) ---
# This script builds the data AND runs the analysis.
import pandas as pd
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
import datetime
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
import os

print("--- STARTING SCRIPT: INDIA MODEL (v21 - 100% LOCAL) ---", flush=True)
ECON_REGION = "INDIA"

# --- 0. PRE-FLIGHT CHECK ---
# Verify that every local input exists before any download or modelling work
# starts, so a missing CSV is reported once, up front, with a clear remedy.
print("\n--- PRE-FLIGHT CHECK ---")
# These files MUST be present in the current working directory (the paths are
# relative, so they are resolved against os.getcwd(), printed below).
REQUIRED_FILES = [
    'F1_India.csv', 'Golf_India.csv', 'Tennis_India.csv',
    'Cricket_India.csv', 'Football_India.csv', 'Basketball_India.csv',
    'India_Recessions.csv',
    'GDP_Growth_India.csv',  # <-- The new manual GDP file
]
print(f"Script is running in this folder:\n{os.getcwd()}\n")
print("Checking for required files...")
missing_files = [path for path in REQUIRED_FILES if not os.path.exists(path)]
if missing_files:
    print("\n*** FATAL ERROR: SCRIPT HALTED. ***")
    print("The following files were NOT found in the folder:")
    for path in missing_files:
        print(f" - {path}")
    print("\nSOLUTION: Download the missing files and re-run.")
    # Exit with a non-zero status so shells / schedulers see the failure.
    # (The bare exit() helper returns status 0 and is only injected by the
    # `site` module, so it is not guaranteed to exist in every interpreter.)
    raise SystemExit(1)

# The count is derived from the list so the message cannot drift out of date.
print(f"...All {len(REQUIRED_FILES)} required CSV files found. Proceeding.")
# --- END OF PRE-FLIGHT CHECK ---


# --- 1. Fetch Economic Data (India) ---
print(f"\nFetching {ECON_REGION} economic data...", flush=True)


def _download_weekly_close(ticker, column_name, start, end):
    """Download weekly closes for *ticker* as a one-column DataFrame.

    Args:
        ticker: Yahoo Finance symbol, e.g. '^NSEI'.
        column_name: Name given to the single output column.
        start: Start date string passed through to yfinance.
        end: End date string passed through to yfinance.

    Returns:
        DataFrame indexed by date with one column named *column_name*.

    Raises:
        ValueError: If yfinance returned no rows for the ticker.
        KeyError: If no 'Close' field can be located in the result.
    """
    raw = yf.download(ticker, start=start, end=end, interval='1wk')
    if raw is None or raw.empty:
        # Fail loudly instead of carrying an empty frame through the pipeline.
        raise ValueError(f"No data returned for ticker '{ticker}'")

    if isinstance(raw.columns, pd.MultiIndex):
        # Newer yfinance releases return (field, ticker) MultiIndex columns
        # even for a single symbol; pick whichever level holds 'Close'.
        for level in range(raw.columns.nlevels):
            if 'Close' in raw.columns.get_level_values(level):
                close = raw.xs('Close', axis=1, level=level).iloc[:, 0]
                break
        else:
            raise KeyError(f"'Close' column not found for ticker '{ticker}'")
    else:
        close = raw['Close']
    return close.rename(column_name).to_frame()


try:
    start_date = '2004-01-01'
    end_date = '2024-12-31'

    # Z-Variables (Controls) - This is the ONLY online fetch
    data_vix = _download_weekly_close('^INDIAVIX', 'VIX', start_date, end_date)
    print("...India VIX (^INDIAVIX) fetched.", flush=True)

    data_nse = _download_weekly_close('^NSEI', 'Market', start_date, end_date)
    print("...NIFTY 50 (^NSEI) fetched.", flush=True)
    econ_df_w = pd.concat([data_vix, data_nse], axis=1)
    # Week-over-week simple return of the index level.
    econ_df_w['Market_Return'] = econ_df_w['Market'].pct_change()

    # Y-Variable (Recession) - Local
    india_q_rec = pd.read_csv('India_Recessions.csv', parse_dates=['DATE'])
    # Align to quarter-end stamps and carry the last known flag forward.
    india_q_rec = india_q_rec.set_index('DATE').resample('QE').ffill()  # QE = Quarter End
    # The downstream code expects the flag under the name 'NBER_RECESSION'.
    india_q_rec = india_q_rec.rename(columns={'RECESSION': 'NBER_RECESSION'})
    print("...India Recession data loaded.", flush=True)

    # Y-Variable (GDP) - Local
    gdp_q = pd.read_csv('GDP_Growth_India.csv', parse_dates=['DATE'])
    gdp_q = gdp_q.set_index('DATE')
    # NOTE(review): this treats 'GDP_Growth_India' as a GDP *level* and turns
    # it into a percent change - confirm the CSV does not already hold rates.
    gdp_q['GDP_GROWTH'] = gdp_q['GDP_Growth_India'].pct_change(periods=1) * 100
    # Stamp each observation at its quarter end, like the recession series, so
    # quarter-start dated rows still line up with quarter-end indexed data.
    gdp_q = gdp_q.resample('QE').last()
    print("...India Real GDP data loaded from local CSV.", flush=True)

except Exception as e:
    print(f"*** FATAL ERROR: Could not process data: {e} ***", flush=True)
    # Non-zero status: a fatal error must not look like a successful run.
    raise SystemExit(1)

# --- 2. Load Local Google Trends CSVs (India) ---
# Each keyword CSV is read, min-max scaled to [0, 1] on its own, and merged
# into one wide frame (one column per keyword) on the union of all dates.
print(f"\nLoading local {ECON_REGION} Google Trends CSVs...", flush=True)
scaler = MinMaxScaler()
KEYWORDS = ['F1', 'Golf', 'Tennis', 'Cricket', 'Football', 'Basketball']
FILENAME_SUFFIX = "_India.csv"
all_trends_df = pd.DataFrame()
for kw in KEYWORDS:
    filename = f"{kw}{FILENAME_SUFFIX}"
    print(f"Attempting to read: '{filename}'", flush=True)
    try:
        # The first two lines of the export are skipped; the header row follows.
        kw_df = pd.read_csv(filename, skiprows=2)
        kw_df['date'] = pd.to_datetime(kw_df['Week'])
        # Drop the raw 'Week' text column: otherwise it is still the FIRST
        # column after set_index, gets picked as the data column below, and
        # the date strings are coerced to NaN -> 0, zeroing the whole series.
        kw_df = kw_df.set_index('date').drop(columns=['Week'])
        data_col_name = kw_df.columns[0]
        kw_df = kw_df.rename(columns={data_col_name: kw})
        # Non-numeric cells are coerced to NaN and then treated as 0.
        kw_df[kw] = pd.to_numeric(kw_df[kw], errors='coerce').fillna(0)
        # fit_transform returns an (n, 1) array; flatten it for the column.
        kw_df[kw] = scaler.fit_transform(kw_df[[kw]]).ravel()
        if all_trends_df.empty:
            all_trends_df = kw_df[[kw]]
        else:
            all_trends_df = all_trends_df.join(kw_df[[kw]], how='outer')
        print(f"--- SUCCESS: Loaded '{filename}' ---", flush=True)
    except Exception as e:
        # Best effort by design: a bad file is reported and skipped so the
        # remaining keywords can still be loaded.
        print(f"--- FAILED: Error for '{filename}': {e}. Skipping. ---", flush=True)

# --- 3. Aggregate and Build Index ---
print(f"\nConstructing the {ECON_REGION} GSSI...", flush=True)
LX_KEYWORDS = ['F1', 'Golf', 'Tennis']
MX_KEYWORDS = ['Cricket', 'Football', 'Basketball']
# Only keywords whose CSV actually loaded can take part in a basket.
lx_cols = [col for col in LX_KEYWORDS if col in all_trends_df.columns]
mx_cols = [col for col in MX_KEYWORDS if col in all_trends_df.columns]
if not lx_cols or not mx_cols:
    print("*** FATAL ERROR: Baskets are empty. ***")
    # Non-zero status so the failure is visible to the caller.
    raise SystemExit(1)
print(f"LX Basket (final): {lx_cols}")
print(f"MX Basket (final): {mx_cols}")
# Each basket index is the row-wise mean of its scaled keyword series.
lx_index_w = all_trends_df[lx_cols].mean(axis=1)
mx_index_w = all_trends_df[mx_cols].mean(axis=1)
df_w = pd.DataFrame({'LX_Index': lx_index_w, 'MX_Index': mx_index_w})
# epsilon keeps the ratio finite when the LX index is 0.
epsilon = 0.01
df_w['MX_LX_Ratio'] = (df_w['MX_Index'] + epsilon) / (df_w['LX_Index'] + epsilon)

# --- 4. Resample to Quarterly & Merge ---
print("Resampling to quarterly and merging datasets...", flush=True)
# Each source is averaged to quarter-end FIRST and only then joined. Joining
# the weekly frames directly requires identical timestamps; when the two
# sources stamp their weeks on different weekdays, nothing matches and the
# market columns come out entirely NaN.
trends_q = df_w.resample('QE').mean()  # QE = Quarter End
econ_q = econ_df_w[['VIX', 'Market_Return']].resample('QE').mean()
if econ_q.index.tz is not None:
    # Trends dates are tz-naive; strip any timezone so the indexes can join.
    econ_q.index = econ_q.index.tz_localize(None)
df_q = trends_q.join(econ_q)
# Keep a stable column order for the saved dataset.
df_q = df_q[['LX_Index', 'MX_Index', 'VIX', 'Market_Return', 'MX_LX_Ratio']]
final_df = df_q.join(gdp_q).join(india_q_rec)
# Keep only quarters where every variable is available.
model_df = final_df.dropna()
if model_df.empty:
    print("*** WARNING: No quarter has complete data; the model dataset is empty. ***",
          flush=True)
output_filename = 'GSSI_model_dataset_INDIA.csv'
model_df.to_csv(output_filename)
print("\n--- DATA BUILD COMPLETE ---", flush=True)
print(f"Your final, model-ready dataset is saved as '{output_filename}'")
print(model_df.head())

# --- 6. STARTING PHASE 4: INDIA MODEL ANALYSIS ---
print("\n\n--- STARTING PHASE 4: INDIA MODEL ANALYSIS ---")
# --- 7. Create the 'Lead' Variable ---
# Recession_Next_Q is the recession flag of the FOLLOWING calendar quarter.
# model_df has already had incomplete rows removed, so a plain shift(-1) could
# pair a quarter with a later, non-adjacent one. Number the quarters and keep
# the shifted flag only where the next row really is the next quarter.
quarter_no = pd.Series(
    model_df.index.year * 4 + model_df.index.quarter,
    index=model_df.index,
)
next_is_adjacent = (quarter_no.shift(-1) - quarter_no) == 1
next_q_flag = model_df['NBER_RECESSION'].shift(-1).where(next_is_adjacent)
# assign() builds a new frame, avoiding chained-assignment warnings that
# column assignment on the result of dropna() can trigger.
model_df = model_df.assign(Recession_Next_Q=next_q_flag)
# Rows without a known next-quarter flag (including the last one) are dropped.
model_df = model_df.dropna(subset=['Recession_Next_Q'])

# --- 8. Descriptive Visual ---
# Line chart of the quarterly MX/LX ratio with recession quarters shaded red.
# Plotting is best effort: a failure is reported and the analysis continues.
print("\nGenerating descriptive plot (India MX/LX Ratio vs. Recessions)...")
fig = None
try:
    fig, ax1 = plt.subplots(figsize=(14, 7))
    ax1.plot(model_df.index, model_df['MX_LX_Ratio'], color='blue',
             label='MX/LX Ratio (GSSI-India)')
    ax1.set_ylabel('GSSI (MX/LX Ratio)')
    ax1.set_xlabel('Year')
    # The x-axis transform makes y run 0..1 in axes coordinates, so the band
    # spans the full plot height regardless of the ratio's scale.
    ax1.fill_between(model_df.index, 0, 1, where=model_df['NBER_RECESSION'] == 1,
                     color='red', alpha=0.3, transform=ax1.get_xaxis_transform(),
                     label='India Recession/Slowdown')
    ax1.legend(loc='upper left')
    ax1.set_title('India GSSI vs. Recession/Slowdown Periods (2004-2024)')
    ax1.grid(True)
    fig.savefig('GSSI_India_vs_Recessions_Plot.png')
    print("...Plot saved as 'GSSI_India_vs_Recessions_Plot.png'.")
except Exception as e:
    print(f"...Could not generate plot: {e}")
finally:
    # Release the figure whether or not saving succeeded; pyplot otherwise
    # keeps it registered for the rest of the run.
    if fig is not None:
        plt.close(fig)

# --- 9. Model 1: The Full Logit Regression ---
# Logit of next-quarter recession on the GSSI ratio plus the two market
# controls. The constant is added first, then incomplete rows are removed, and
# the target is restricted to exactly the rows that survive.
print("\n--- Running India Full Logit Model (Horserace) ---")
X = sm.add_constant(model_df[['MX_LX_Ratio', 'VIX', 'Market_Return']]).dropna()
Y = model_df.loc[X.index, 'Recession_Next_Q']
try:
    logit_model = sm.Logit(Y, X).fit(disp=0)
    print(logit_model.summary())
except Exception as exc:
    # Estimation problems are reported rather than aborting the script.
    print(f"--- Logit Model Failed: {exc} ---")
    print("This can be due to too few recession '1's in your 'India_Recessions.csv' file.")

# --- 10. Model 2: Granger Causality (QJE Standard) ---
# Tests H0: the (stationary) MX/LX ratio does not Granger-cause GDP growth,
# using a VAR whose lag order is chosen by AIC. Every failure is reported and
# the script carries on to its final message.
print("\n--- Running India Granger Causality Test ---")
try:
    # Work on an independent copy so the column added below never writes into
    # (or warns about) a slice of model_df.
    var_data = model_df[['GDP_GROWTH', 'MX_LX_Ratio', 'VIX']].dropna().copy()

    # Print diagnostics
    print("\nGranger Test Data Diagnostics:")
    print("Number of observations:", len(var_data))
    print("GDP_GROWTH variance:", var_data['GDP_GROWTH'].var())
    print("MX_LX_Ratio variance:", var_data['MX_LX_Ratio'].var())
    print("VIX variance:", var_data['VIX'].var())

    # A (near-)constant series carries no information for the test.
    if var_data.empty or var_data['GDP_GROWTH'].var() < 1e-10 or var_data['MX_LX_Ratio'].var() < 1e-10:
        print("--- Granger Test Skipped: Insufficient variation in data ---")
        print("This usually means GDP_GROWTH is constant (zeros) or missing.")
    else:
        # Run unit root test and prepare stationary series
        adf_test = adfuller(var_data['MX_LX_Ratio'])
        print(f"\nADF test p-value: {adf_test[1]:.4f}")
        if adf_test[1] > 0.05:
            print("Ratio is non-stationary. Using first difference.")
            var_data['Ratio_stationary'] = var_data['MX_LX_Ratio'].diff()
        else:
            print("Ratio is stationary. Using as-is.")
            var_data['Ratio_stationary'] = var_data['MX_LX_Ratio']

        # Drop any NaN introduced by differencing
        var_data = var_data.dropna()

        if len(var_data) >= 8:  # Minimum sample before a VAR is attempted
            # Cap the lag search at 4 and at a third of the sample size.
            max_lags = min(4, len(var_data) // 3)
            try:
                # First try with all variables
                var_model = VAR(var_data[['GDP_GROWTH', 'Ratio_stationary', 'VIX']])
                var_results = var_model.fit(maxlags=max_lags, ic='aic')
                test_result = var_results.test_causality('GDP_GROWTH', 'Ratio_stationary', kind='f')
                print("\nGranger Test (H0: GSSI-India does not cause GDP Growth):")
                print(f"P-value: {test_result.pvalue:.4f}")
            except Exception as e:
                # Catch Exception only (never KeyboardInterrupt / SystemExit)
                # and say why the full model was abandoned before falling back.
                print(f"\n...Full VAR (GDP, Ratio, VIX) failed: {e}")
                # If that fails, try just GDP and Ratio
                print("\nRetrying Granger test with simplified model (GDP & Ratio only)...")
                var_model = VAR(var_data[['GDP_GROWTH', 'Ratio_stationary']])
                var_results = var_model.fit(maxlags=max_lags, ic='aic')
                test_result = var_results.test_causality('GDP_GROWTH', 'Ratio_stationary', kind='f')
                print(f"P-value: {test_result.pvalue:.4f}")
        else:
            print(f"--- Granger Test Skipped: Need at least 8 observations, have {len(var_data)} ---")

except Exception as e:
    print(f"--- Granger Causality Failed: {e} ---")

print("\n--- INDIA ANALYSIS COMPLETE ---")